**************************
***	RUN USING STATA 16 ***
**************************

* Purpose: simulate pass rates based on test difficulty for other WASSCE countries
* Last updated: 20 Apr 2023

*---------------------------------------------------------------------------------------
* Simulate for The Gambia	
*---------------------------------------------------------------------------------------

*** prepare parameters from IRT

	use "$output\mat_obj_par.dta", clear
	
    keep item disc diff
	ren disc a
	ren diff b1

	tempfile o
	save `o'
	
	use "$output\mat_sub_par.dta", clear	

		append using `o'
		
		//update values for constructed response categories
		forvalues k = 1/7 {
			replace v`k' = v`k'*2
		}

		order item, first
		
		drop options		
		egen options = rownonmiss(b1- b7)
			
		//create year indicator
		gen yr = substr(item,3,1)
		destring yr, replace
		replace yr = yr+2010
			
		*remove TIMSS items for now, simplifies sorting
		gen length=length(item)
		drop if length==6
		drop length
		
		//simulate students
		drop x
		sort item
		gen x=_n
		tempfile parms
		save `parms'
		
*-------------------------------------------------------------------------------
* setup loop for distributions
*-------------------------------------------------------------------------------

		use `parms', clear
		set seed 60420
		forvalues s = 1/1000 {
		gen s_`s' = rnormal(-0.4,1)

				
		gen st_`s' = s_`s' if x==1
		egen ability_`s' = mean(st_`s')
		drop s_* st_*
		}
		
		forvalues t = 1/1000 {
			
			gen n1_`t' = exp(a*(ability_`t'-b1))
			gen n2_`t' = exp(a*((2*ability_`t')-b1-b2))
			gen n3_`t' = exp(a*((3*ability_`t')-b1-b2-b3))
			gen n4_`t' = exp(a*((4*ability_`t')-b1-b2-b3-b4))
			gen n5_`t' = exp(a*((5*ability_`t')-b1-b2-b3-b4-b5))
			gen n6_`t' = exp(a*((6*ability_`t')-b1-b2-b3-b4-b5-b6))
			gen n7_`t' = exp(a*((7*ability_`t')-b1-b2-b3-b4-b5-b6-b7))
			
			gen d_`t' = 1 + n1_`t' + n2_`t' if options==2
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' if options==3
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' if options==4
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' if options==5
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' + n6_`t' if options==6
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' + n6_`t' + n7_`t' if options==7
		
			gen p1_`t' = n1_`t'/d_`t'
			gen p2_`t' = n2_`t'/d_`t'
			gen p3_`t' = n3_`t'/d_`t'
			gen p4_`t' = n4_`t'/d_`t'
			gen p5_`t' = n5_`t'/d_`t'
			gen p6_`t' = n6_`t'/d_`t'
			gen p7_`t' = n7_`t'/d_`t'
		
			gen exp_`t' = v1*p1_`t'
			replace exp_`t' = exp_`t' + v2*p2_`t' 	if p2_`t'!=.
			replace exp_`t' = exp_`t' + v3*p3_`t' 	if p3_`t'!=.
			replace exp_`t' = exp_`t' + v4*p4_`t' 	if p4_`t'!=.
			replace exp_`t' = exp_`t' + v5*p5_`t' 	if p5_`t'!=.
			replace exp_`t' = exp_`t' + v6*p6_`t' 	if p6_`t'!=.
			replace exp_`t' = exp_`t' + v7*p7_`t' 	if p7_`t'!=.
		}
		
		
		forvalues t = 1/1000 {
		gen p0_`t' = (exp(a*(ability_`t'-b1)))/(1+exp(a*(ability_`t'-b1))) if options==1
		}
		
		forvalues u = 1/1000 {
		egen scr_s_`u' = sum(exp_`u'), by(yr)
		}
		
		forvalues u = 1/1000 {
		egen scr_o_`u' = sum(p0_`u'), by(yr)
		}
		
		forvalues u = 1/1000 {
		gen tot`u' = scr_s_`u' + scr_o_`u' 
		}
		
**calculate total marks on offer for each year - accounts for omitted objective and subjective items
	gen type = substr(item, 1, 1)
	gen location = substr(item, 4,2)
	destring location, replace
	gen marks = 1
	replace marks = 8 if type=="s" & location<=5
	replace marks = 12 if type== "s" & location>5

	egen marks_yr = sum(marks), by(yr)
	
	**these marks_yr already incorporate missing items from objective tests
	**make manual adjustment for seleced subjective items which are not currently scored from 8 or 12
	**see 'subjective marks available.xlsx'
	replace marks_yr = marks_yr - 9 if yr==2011
	replace marks_yr = marks_yr - 8 if yr==2012
	replace marks_yr = marks_yr - 5 if yr==2015
	replace marks_yr = marks_yr - 2 if yr==2016
	replace marks_yr = marks_yr - 11 if yr==2018 /*NOTE: for 2018, item s_808 is omitted from analysis already, so 12 points already deducted*/

	*keep only one observation by year and look at distributions
	egen pick = tag(yr)
	keep if pick==1
	keep yr tot* marks_yr ability*
	
	//generate shares reaching certain thresholds	
	*assumed normal distribution to start
	reshape long tot ability_, i(yr) j(t)
	
	drop t //remove unnecessary
	sort yr ability_ 
	bysort yr : gen n = _n //define rank order
	gen pct = tot/marks_yr //gen percent score
		
*-------------------------------------------------------------------------------
* use test difficulty from Ghana simulation (see 06 math distribution do-file)
*-------------------------------------------------------------------------------
	gen jp_99 = 0.1947428
	gen jc_99 = 0.276595		
	
*-------------------------------------------------------------------------------
* identify who is passing
*-------------------------------------------------------------------------------		
	gen c_99 = (pct>=jc_99) //identify credit passes
	gen p_99 = (pct>=jp_99 & c_99!=1) //identify passes that are not with credit
	
	collapse (sum) c_* p_*, by(yr)
	reshape long c_ p_, i(yr) j(cal)
	
	replace cal = cal+2000
	replace c_ = c_/10  //put into %
	replace p_ = p_/10  //put into %
	gen all_pass = c_ + p_
	
	label var c_ "% passing with credit"
	label var p_ "% passing without credit"
	label var all_pass "% passing"
		
	
*** graph Gambia simulation
	graph bar c_ p_, over(yr, gap(30) label(labsize(10pt) angle(90))) stack ///
	bar(1, fcolor(black) fi(100) lcolor(none)) ///
	bar(2, fcolor(white) fi(100) lw(vthin) lcolor(black)) ///
	legend(lab(1 "Credit or better") lab(2 "Pass") order(2 1) pos(6) col(2) size(10pt)) ///
	subtitle("Panel B: The Gambia", span pos(11) size(10pt)) ///
	ylabel(0 "0%" 20 "20%" 40 "40%" 60 "60%" 80 "80%" 100 "100%", labsize(10pt)) name(gmb, replace)
		
*---------------------------------------------------------------------------------------
* Simulate for Nigeria
*---------------------------------------------------------------------------------------

*** prepare parameters from IRT
	use "$output\mat_obj_par.dta", clear
	
    keep item disc diff
	ren disc a
	ren diff b1

	tempfile o
	save `o'
	
	use "$output\mat_sub_par.dta", clear	

		append using `o'
		
		//update values for constructed response categories
		forvalues k = 1/7 {
			replace v`k' = v`k'*2
		}

		order item, first
		
		drop options		
		egen options = rownonmiss(b1- b7)
			
		//create year indicator
		gen yr = substr(item,3,1)
		destring yr, replace
		replace yr = yr+2010
			
		*remove TIMSS items for now, simplifies sorting
		gen length=length(item)
		drop if length==6
		drop length
		
		//simulate students
		drop x
		sort item
		gen x=_n
		tempfile parms
		save `parms'
		
*-------------------------------------------------------------------------------
* setup loop for distributions
*-------------------------------------------------------------------------------

		use `parms', clear
		set seed 60420
		forvalues s = 1/1000 {
		gen s_`s' = rnormal(2.2,1.4)
		
		gen st_`s' = s_`s' if x==1
		egen ability_`s' = mean(st_`s')
		drop s_* st_*
		}
		
		forvalues t = 1/1000 {
			
			gen n1_`t' = exp(a*(ability_`t'-b1))
			gen n2_`t' = exp(a*((2*ability_`t')-b1-b2))
			gen n3_`t' = exp(a*((3*ability_`t')-b1-b2-b3))
			gen n4_`t' = exp(a*((4*ability_`t')-b1-b2-b3-b4))
			gen n5_`t' = exp(a*((5*ability_`t')-b1-b2-b3-b4-b5))
			gen n6_`t' = exp(a*((6*ability_`t')-b1-b2-b3-b4-b5-b6))
			gen n7_`t' = exp(a*((7*ability_`t')-b1-b2-b3-b4-b5-b6-b7))
			
			gen d_`t' = 1 + n1_`t' + n2_`t' if options==2
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' if options==3
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' if options==4
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' if options==5
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' + n6_`t' if options==6
			replace d_`t' = 1 + n1_`t' + n2_`t' + n3_`t' + n4_`t' + n5_`t' + n6_`t' + n7_`t' if options==7
		
			gen p1_`t' = n1_`t'/d_`t'
			gen p2_`t' = n2_`t'/d_`t'
			gen p3_`t' = n3_`t'/d_`t'
			gen p4_`t' = n4_`t'/d_`t'
			gen p5_`t' = n5_`t'/d_`t'
			gen p6_`t' = n6_`t'/d_`t'
			gen p7_`t' = n7_`t'/d_`t'
		
			gen exp_`t' = v1*p1_`t'
			replace exp_`t' = exp_`t' + v2*p2_`t' 	if p2_`t'!=.
			replace exp_`t' = exp_`t' + v3*p3_`t' 	if p3_`t'!=.
			replace exp_`t' = exp_`t' + v4*p4_`t' 	if p4_`t'!=.
			replace exp_`t' = exp_`t' + v5*p5_`t' 	if p5_`t'!=.
			replace exp_`t' = exp_`t' + v6*p6_`t' 	if p6_`t'!=.
			replace exp_`t' = exp_`t' + v7*p7_`t' 	if p7_`t'!=.
		}
		
		
		forvalues t = 1/1000 {
		gen p0_`t' = (exp(a*(ability_`t'-b1)))/(1+exp(a*(ability_`t'-b1))) if options==1
		}
		
		forvalues u = 1/1000 {
		egen scr_s_`u' = sum(exp_`u'), by(yr)
		}
		
		forvalues u = 1/1000 {
		egen scr_o_`u' = sum(p0_`u'), by(yr)
		}
		
		forvalues u = 1/1000 {
		gen tot`u' = scr_s_`u' + scr_o_`u' 
		}
		
**calculate total marks on offer for each year - accounts for omitted objective and subjective items
	gen type = substr(item, 1, 1)
	gen location = substr(item, 4,2)
	destring location, replace
	gen marks = 1
	replace marks = 8 if type=="s" & location<=5
	replace marks = 12 if type== "s" & location>5

	egen marks_yr = sum(marks), by(yr)
	
	**these marks_yr already incorporate missing items from objective tests
	**make manual adjustment for seleced subjective items which are not currently scored from 8 or 12
	**see 'subjective marks available.xlsx'
	replace marks_yr = marks_yr - 9 if yr==2011
	replace marks_yr = marks_yr - 8 if yr==2012
	replace marks_yr = marks_yr - 5 if yr==2015
	replace marks_yr = marks_yr - 2 if yr==2016
	replace marks_yr = marks_yr - 11 if yr==2018 /*NOTE: for 2018, item s_808 is omitted from analysis already, so 12 points already deducted*/

	*keep only one observation by year and look at distributions
	egen pick = tag(yr)
	keep if pick==1
	keep yr tot* marks_yr ability*
	
	//generate shares reaching certain thresholds	
	*assumed normal distribution to start
	reshape long tot ability_, i(yr) j(t)
	
	drop t //remove unnecessary
	sort yr ability_ 
	bysort yr : gen n = _n //define rank order
	gen pct = tot/marks_yr //gen percent score
		
*-------------------------------------------------------------------------------
* use test difficulty from Ghana simulation (see 06 math distribution do-file)
*-------------------------------------------------------------------------------
	gen jp_99 = 0.1947428
	gen jc_99 = 0.276595	
	
*-------------------------------------------------------------------------------
* identify who is passing
*-------------------------------------------------------------------------------		
	gen c_99 = (pct>=jc_99) //identify credit passes
	gen p_99 = (pct>=jp_99 & c_99!=1) //identify passes that are not with credit
	
	collapse (sum) c_* p_*, by(yr)
	reshape long c_ p_, i(yr) j(cal)
	
	replace cal = cal+2000
	replace c_ = c_/10  //put into %
	replace p_ = p_/10  //put into %
	gen all_pass = c_ + p_
	
	label var c_ "% passing with credit"
	label var p_ "% passing without credit"
	label var all_pass "% passing"
	
*** graph Nigeria simulation
	graph bar c_ p_, over(yr, gap(30) label(labsize(10pt) angle(90))) stack ///
	bar(1, fcolor(black) fi(100) lcolor(none)) ///
	bar(2, fcolor(white) fi(100) lw(vthin) lcolor(black)) ///
	legend(lab(1 "Credit or better") lab(2 "Pass") order(2 1) pos(6) col(2) size(10pt)) ///
	subtitle("Panel A: Nigeria", span pos(11) size(10pt)) ///
	ylabel(0 "0%" 20 "20%" 40 "40%" 60 "60%" 80 "80%" 100 "100%", labsize(10pt)) name(nga, replace)	
	
	grc1leg nga gmb, col(2) name(combo, replace)
	
	graph display combo, xsize(4.5) ysize(3)
	graph export "$graph\5.png", replace
	graph export "$graph\5.eps", replace
				
		